# Libraries used throughout the notebook
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Load the iris sample dataset through seaborn and take a first look at it
df = sns.load_dataset('iris')
print(df.shape)
df.head(2)
(150, 5)
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
# Print a summary of the DataFrame: columns, non-null counts, dtypes, memory
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal_length 150 non-null float64 1 sepal_width 150 non-null float64 2 petal_length 150 non-null float64 3 petal_width 150 non-null float64 4 species 150 non-null object dtypes: float64(4), object(1) memory usage: 6.0+ KB
# Display the column labels of the DataFrame
df.columns
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
'species'],
dtype='object')
# Scatter plot of sepal length vs. sepal width, one colour per species
from pathlib import Path

# write_html does not create missing folders, so make sure the target
# directory exists before any figure is saved into it.
Path("outputs").mkdir(parents=True, exist_ok=True)

fig1 = px.scatter(df, x='sepal_length', y='sepal_width', color="species")
fig1.show()

# Save the plot as a standalone HTML file
fig1.write_html("outputs/fig1.html")
# Static-image alternative, left disabled:
# fig1.write_image("outputs/fig1.png", scale=3)
# Line plot: order the rows by sepal length (the x axis) before drawing
df_line = df.sort_values(by=['sepal_length'])
fig2 = px.line(
    df_line,
    x='sepal_length',
    y='sepal_width',
    color="species",
)
fig2.show()

# Export the figure to HTML
fig2.write_html("outputs/fig2.html")
# Per-species mean of every remaining column, with species as a regular column
df_bar = (
    df.groupby(['species'])
    .mean()
    .reset_index()
)
df_bar.head()
| species | sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|---|
| 0 | setosa | 5.006 | 3.428 | 1.462 | 0.246 |
| 1 | versicolor | 5.936 | 2.770 | 4.260 | 1.326 |
| 2 | virginica | 6.588 | 2.974 | 5.552 | 2.026 |
# Bar chart of the mean sepal width per species (uses the grouped frame)
fig3 = px.bar(
    df_bar,
    x='species',
    y='sepal_width',
)
fig3.show()

# Export the figure to HTML
fig3.write_html("outputs/fig3.html")
# seaborn aggregates per category itself, so the raw (ungrouped) frame is
# passed in directly.
# The frame is given via the `data=` keyword rather than positionally, so the
# call does not depend on the position of the first parameter, which differs
# between seaborn releases.
sns.barplot(data=df, x="species", y="sepal_width")
plt.show()
# Box plot of sepal width for each species, coloured by species
fig4 = px.box(
    df,
    x='species',
    y='sepal_width',
    color='species',
)
fig4.show()

# Export the figure to HTML
fig4.write_html("outputs/fig4.html")
# Violin plot of sepal width per species, with an embedded box plot
fig5 = px.violin(
    df,
    x='species',
    y='sepal_width',
    color='species',
    box=True,
)
fig5.show()

# Export the figure to HTML
fig5.write_html("outputs/fig5.html")
# 6. Histogram of sepal width, coloured by species
fig6 = px.histogram(
    df,
    x='sepal_width',
    color='species',
)
fig6.show()

# Export the figure to HTML
fig6.write_html("outputs/fig6.html")
# First check the dataset: count how many rows each species has
df['species'].value_counts()
setosa 50 versicolor 50 virginica 50 Name: species, dtype: int64
# Pie chart of the number of rows per species
# Build a two-column frame (species, count) from the value counts
df_pie = df['species'].value_counts().reset_index()
df_pie.columns = ['species', 'count']

fig7 = px.pie(
    df_pie,
    values='count',
    names='species',
)
fig7.show()

# Export the figure to HTML
fig7.write_html("outputs/fig7.html")
# 3D scatter plot of sepal length, sepal width and petal width by species
fig8 = px.scatter_3d(
    df,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='species',
)
fig8.show()

# Export the figure to HTML
fig8.write_html("outputs/fig8.html")
# Area chart of sepal length over sepal width, one area per species.
# The trace is drawn in row order, so the rows are sorted by the column
# plotted on the x axis; sorting by a different column makes the outline
# jump back and forth along x.
df_area = df.sort_values(by=['sepal_width'])
fig9 = px.area(df_area, x='sepal_width', y='sepal_length', color='species')
fig9.show()

# Export the figure to HTML
fig9.write_html("outputs/fig9.html")
# Bubble chart: a scatter plot whose marker size follows petal length
fig10 = px.scatter(
    df,
    x='sepal_length',
    y='sepal_width',
    size='petal_length',
    color='species',
)
fig10.show()

# Export the figure to HTML
fig10.write_html("outputs/fig10.html")
# 11. Sunburst chart: species on the inner ring, petal width on the outer ring
# Count the rows for every (species, petal_width) combination
df_sunburst = (
    df.groupby(['species', 'petal_width'])
    .size()
    .reset_index(name='counts')
)
fig11 = px.sunburst(
    df_sunburst,
    path=['species', 'petal_width'],
    values='counts',
)
fig11.show()

# Export the figure to HTML
fig11.write_html("outputs/fig11.html")
Let's try another sunburst plot
# Sunburst plot on the titanic sample dataset
titanic = sns.load_dataset('titanic')

# Rings from the centre outwards: sex, class, who, alive, alone;
# sector sizes come from the 'survived' column, colours from 'sex'
fig = px.sunburst(
    titanic,
    path=['sex', 'class', 'who', 'alive', 'alone'],
    values='survived',
    color='sex',
)

# Display the figure, then export it to HTML
fig.show()
fig.write_html("outputs/figg.html")
# Parallel coordinates plot
# The line colour needs a number, so encode each species name as a
# category code in a new 'species_id' column
df['species_id'] = df['species'].astype('category').cat.codes

fig12 = px.parallel_coordinates(
    df,
    color='species_id',
    labels={'species_id': 'species'},
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig12.show()

# Export the figure to HTML
fig12.write_html("outputs/fig12.html")
# Density contour plot of sepal length vs. sepal width, per species
fig13 = px.density_contour(
    df,
    x='sepal_length',
    y='sepal_width',
    color='species',
)
fig13.show()

# Export the figure to HTML
fig13.write_html("outputs/fig13.html")
# Ternary scatter plot with sepal length, sepal width and petal width
# on the three axes
fig14 = px.scatter_ternary(
    df,
    a='sepal_length',
    b='sepal_width',
    c='petal_width',
    color='species',
)
fig14.show()

# Export the figure to HTML
fig14.write_html("outputs/fig14.html")
# Polar (radar) chart of the mean sepal length per species
df_radar = (
    df.groupby(['species'])
    .mean()
    .reset_index()
)
fig15 = px.line_polar(
    df_radar,
    r='sepal_length',
    theta='species',
    line_close=True,  # join the last point back to the first
)
fig15.show()

# Export the figure to HTML
fig15.write_html("outputs/fig15.html")
# Sunburst plot on the titanic dataset (self-contained cell)
import plotly.express as px
import seaborn as sns

# Note: from here on `df` refers to the titanic data
df = sns.load_dataset('titanic')

# Rings from the centre outwards: sex, class, who, alive, alone;
# sector sizes come from the 'survived' column, colours from 'sex'
fig = px.sunburst(
    df,
    path=['sex', 'class', 'who', 'alive', 'alone'],
    values='survived',
    color='sex',
)

# Display the figure, then export it to HTML
fig.show()
fig.write_html("outputs/fig17.html")
# Load seaborn's 'tips' sample dataset, print its shape and preview two rows
tips = sns.load_dataset('tips')
print(tips.shape)
tips.head(2)
(244, 7)
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
# Sunburst of tips: sex -> smoker -> day -> time, sized by the tip amount
fig = px.sunburst(
    tips,
    path=['sex', 'smoker', 'day', 'time'],
    values='tip',
    color='sex',
    # extra columns to include in the hover tooltip
    hover_data=['tip', 'day', 'time'],
)

# Display the figure, then export it to HTML
fig.show()
fig.write_html("outputs/fig18.html")
# Load plotly's gapminder sample data into `df` (replacing the previous
# contents), print its shape and preview two rows
df = px.data.gapminder()
print(df.shape)
df.head(2)
(1704, 8)
| country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 | AFG | 4 |
| 1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 | AFG | 4 |
# Sunburst of population: continent -> country -> year
fig = px.sunburst(
    df,
    path=['continent', 'country', 'year'],
    values='pop',
    color='continent',
)
# Open question from the author: how can more values be shown when
# hovering over the plot?

# Display the figure, then export it to HTML
fig.show()
fig.write_html("outputs/fig19.html")
# Animated bubble chart: GDP per capita (log x axis) vs. life expectancy,
# one frame per year, bubbles sized by population and coloured by continent
px.scatter(
    df,
    x="gdpPercap",
    y="lifeExp",
    size="pop",
    color="continent",
    animation_frame='year',
    animation_group="country",
    log_x=True,
    size_max=55,
    range_x=[100, 100000],
    range_y=[5, 100],
)
# Animated bubble chart variant: here the bubbles are coloured by country
# and the animation group is the continent
px.scatter(
    df,
    x="gdpPercap",
    y="lifeExp",
    size="pop",
    color="country",
    animation_frame='year',
    animation_group="continent",
    log_x=True,
    size_max=55,
    range_x=[100, 100000],
    range_y=[5, 100],
)
# Build the animated bubble chart (coloured by country, animation group
# set to continent) and save it as an HTML file
fig = px.scatter(
    df,
    x="gdpPercap",
    y="lifeExp",
    size="pop",
    color="country",
    animation_frame='year',
    animation_group="continent",
    log_x=True,
    size_max=55,
    range_x=[100, 100000],
    range_y=[5, 100],
)
fig.show()
fig.write_html("outputs/gapminder.html")
# Export the gapminder animation as an animated GIF
import plotly.express as px
import pandas as pd
import numpy as np
import io
# Import the submodule explicitly: a bare `import PIL` does not by itself make
# `PIL.Image` available; it only works if something else imported it first.
import PIL.Image

fig = px.scatter(df, x="gdpPercap",
                 y="lifeExp",
                 size="pop", color="continent",
                 animation_frame='year', animation_group="country",
                 log_x=True, size_max=55, range_x=[100, 100000], range_y=[5, 100])
fig.show()

# Generate one image per animation frame
frames = []
for s, fr in enumerate(fig.frames):
    # Put this frame's traces into the main figure
    fig.update(data=fr.data)
    # Move the slider to the matching position
    fig.layout.sliders[0].update(active=s)
    # Render the current state to PNG bytes and open them as a PIL image
    frames.append(PIL.Image.open(io.BytesIO(fig.to_image(format="png", scale=3))))

# Write all frames into a single animated GIF
frames[0].save(
    "outputs/gapminder.gif",
    save_all=True,
    append_images=frames[1:],
    optimize=True,
    duration=500,  # milliseconds per frame
    loop=0,  # infinite loop
    dither=None  # Turn off dithering
)
# Load plotly's election sample data and the matching district GeoJSON
import plotly.express as px
df = px.data.election()
geojson = px.data.election_geojson()
# Print one district name from the table and the properties of the first
# GeoJSON feature to see how the two are keyed
print(df["district"][2])
print(geojson["features"][0]["properties"])
11-Sault-au-Récollet
{'district': '11-Sault-au-Récollet'}
# Preview the first rows of the election data
df.head()
| district | Coderre | Bergeron | Joly | total | winner | result | district_id | |
|---|---|---|---|---|---|---|---|---|
| 0 | 101-Bois-de-Liesse | 2481 | 1829 | 3024 | 7334 | Joly | plurality | 101 |
| 1 | 102-Cap-Saint-Jacques | 2525 | 1163 | 2675 | 6363 | Joly | plurality | 102 |
| 2 | 11-Sault-au-Récollet | 3348 | 2770 | 2532 | 8650 | Coderre | plurality | 11 |
| 3 | 111-Mile-End | 1734 | 4782 | 2514 | 9030 | Bergeron | majority | 111 |
| 4 | 112-DeLorimier | 1770 | 5933 | 3044 | 10747 | Bergeron | majority | 112 |
# Choropleth map of the 'Coderre' column per district.
# Rows are matched to GeoJSON features through the district name
# (df['district'] <-> feature.properties.district).
fig = px.choropleth_mapbox(
    df,
    geojson=geojson,
    color="Coderre",
    locations="district",
    featureidkey="properties.district",
    center={"lat": 45.5517, "lon": -73.7073},
    mapbox_style="carto-positron",
    zoom=9,
)
# Remove the margins around the map
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
fig.write_html("outputs/fig20.html")
# Small hand-written table: five countries and their populations
countries = ['China', 'India', 'United States', 'Indonesia', 'Pakistan']
populations = [1444216107, 1393409038, 332915073, 276361783, 225199937]
data = {'Country': countries, 'Population': populations}

# Turn the dictionary into a DataFrame and display it
df = pd.DataFrame(data)
df
| Country | Population | |
|---|---|---|
| 0 | China | 1444216107 |
| 1 | India | 1393409038 |
| 2 | United States | 332915073 |
| 3 | Indonesia | 276361783 |
| 4 | Pakistan | 225199937 |
# Load the COVID dataset from the local CSV file into `df` (replacing the
# previous contents), print its shape and preview two rows
df = pd.read_csv("DATA/covid-data.csv")
print(df.shape)
df.head(2)
(302512, 67)
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | total_cases_per_million | new_cases_per_million | new_cases_smoothed_per_million | total_deaths_per_million | new_deaths_per_million | new_deaths_smoothed_per_million | reproduction_rate | icu_patients | icu_patients_per_million | hosp_patients | hosp_patients_per_million | weekly_icu_admissions | weekly_icu_admissions_per_million | weekly_hosp_admissions | weekly_hosp_admissions_per_million | total_tests | new_tests | total_tests_per_thousand | new_tests_per_thousand | new_tests_smoothed | new_tests_smoothed_per_thousand | positive_rate | tests_per_case | tests_units | total_vaccinations | people_vaccinated | people_fully_vaccinated | total_boosters | new_vaccinations | new_vaccinations_smoothed | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | total_boosters_per_hundred | new_vaccinations_smoothed_per_million | new_people_vaccinated_smoothed | new_people_vaccinated_smoothed_per_hundred | stringency_index | population_density | median_age | aged_65_older | aged_70_older | gdp_per_capita | extreme_poverty | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-01-03 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 54.422 | 18.6 | 2.581 | 1.337 | 1803.987 | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2020-01-04 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 54.422 | 18.6 | 2.581 | 1.337 | 1803.987 | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
# Mean of the 'total_cases' column for every location
df_total_cases = (
    df.groupby(['location'])['total_cases']
    .mean()
    .reset_index()
)
df_total_cases
| location | total_cases | |
|---|---|---|
| 0 | Afghanistan | 1.198330e+05 |
| 1 | Africa | 7.240734e+06 |
| 2 | Albania | 1.734666e+05 |
| 3 | Algeria | 1.678651e+05 |
| 4 | American Samoa | 5.085308e+03 |
| ... | ... | ... |
| 250 | Western Sahara | NaN |
| 251 | World | 2.910590e+08 |
| 252 | Yemen | 7.537913e+03 |
| 253 | Zambia | 1.850571e+05 |
| 254 | Zimbabwe | 1.334758e+05 |
255 rows × 2 columns
# World map coloured by the per-location 'total_cases' value;
# locations are matched by country name
fig = px.choropleth(
    df_total_cases,
    locations='location',
    locationmode='country names',
    color='total_cases',
    title='Total Cases by Country',
    range_color=[0, 10000000],
)

# Display the map, then export it to HTML
fig.show()
fig.write_html("outputs/fig21.html")
# Animated world map of new cases, one frame per value of 'date'
px.choropleth(
    df,
    locations='location',
    locationmode='country names',
    color='new_cases',
    # The map is coloured by 'new_cases', so the title names that column
    # (it previously read 'Total Cases by Country').
    title='New Cases by Country',
    range_color=[0, 10000],
    animation_frame='date',
)
# Aggregate the daily data to one row per month and location.

# Convert the 'date' column to datetime so the .dt accessor can be used
df['date'] = pd.to_datetime(df['date'])
# New column 'year_month' holding the monthly period of each row
df['year_month'] = df['date'].dt.to_period('M')
# 'total_cases' is a running (cumulative) count in this dataset, so adding up
# its daily values would count the same cases once per day. The maximum within
# a month is the cumulative total reached in that month.
# NOTE: a month with no reported value for a location now yields NaN
# (no data) rather than 0.0.
monthly_cases = (
    df.groupby(['year_month', 'location'])
    .agg({'total_cases': 'max'})
    .reset_index()
)
# Convert 'year_month' back to strings (e.g. '2020-01') for readability
monthly_cases['year_month'] = monthly_cases['year_month'].astype(str)
# Preview: one row per (year_month, location) with its cumulative total
monthly_cases.head()
| year_month | location | total_cases | |
|---|---|---|---|
| 0 | 2020-01 | Afghanistan | 0.0 |
| 1 | 2020-01 | Africa | 0.0 |
| 2 | 2020-01 | Albania | 0.0 |
| 3 | 2020-01 | Algeria | 0.0 |
| 4 | 2020-01 | American Samoa | 0.0 |
# Animated world map of the monthly 'total_cases' values, one frame per month.

# Derive the date range shown in the title from the data itself: the
# previously hard-coded start ("January 2021") did not match the data, whose
# first month is 2020-01.
first_month = pd.Period(monthly_cases['year_month'].min(), freq='M').strftime('%B %Y')
last_month = pd.Period(monthly_cases['year_month'].max(), freq='M').strftime('%B %Y')

fig = px.choropleth(
    monthly_cases,
    locations='location',
    locationmode='country names',
    color='total_cases',
    title=f'Total Covid Cases Worldwide ({first_month} -to- {last_month})',
    # Cap the colour scale at the 85th percentile of the plotted values
    range_color=[0, monthly_cases['total_cases'].quantile(0.85)],
    animation_frame='year_month',
    color_continuous_scale='viridis',
    labels={'year_month': 'Year-Month', 'total_cases': 'Total Cases'},
)
# Increase the size of the map
fig.update_layout(height=600, width=800)
# Save the monthly choropleth animation as HTML and as an animated GIF
import plotly.express as px
import pandas as pd
import numpy as np
import io
# Import the submodule explicitly: a bare `import PIL` does not by itself make
# `PIL.Image` available; it only works if something else imported it first.
import PIL.Image

# Derive the date range shown in the title from the data itself: the
# previously hard-coded start ("January 2021") did not match the data, whose
# first month is 2020-01.
first_month = pd.Period(monthly_cases['year_month'].min(), freq='M').strftime('%B %Y')
last_month = pd.Period(monthly_cases['year_month'].max(), freq='M').strftime('%B %Y')

fig = px.choropleth(
    monthly_cases,
    locations='location',
    locationmode='country names',
    color='total_cases',
    title=f'Total Covid Cases Worldwide ({first_month} -to- {last_month})',
    # Cap the colour scale at the 85th percentile of the plotted values
    range_color=[0, monthly_cases['total_cases'].quantile(0.85)],
    animation_frame='year_month',
    color_continuous_scale='viridis',
    labels={'year_month': 'Year-Month', 'total_cases': 'Total Cases'},
)
fig.show()
fig.write_html("outputs/fig22.html")

# Increase the size of the map (applies to the GIF frames rendered below)
fig.update_layout(height=600, width=800)

# Generate one image per animation frame
frames = []
for s, fr in enumerate(fig.frames):
    # Put this frame's traces into the main figure
    fig.update(data=fr.data)
    # Move the slider to the matching position
    fig.layout.sliders[0].update(active=s)
    # Render the current state to PNG bytes and open them as a PIL image
    frames.append(PIL.Image.open(io.BytesIO(fig.to_image(format="png", scale=3))))

# Write all frames into a single animated GIF.
# The file goes into "outputs/" like every other artefact of this notebook
# (it previously pointed at "./output/", a different folder).
frames[0].save(
    "outputs/total_covid_cases_worldwide.gif",
    save_all=True,
    append_images=frames[1:],
    optimize=True,
    duration=500,  # milliseconds per frame
    loop=0,  # infinite loop
    dither=None  # Turn off dithering
)